European Soccer Database
This soccer database comes from Kaggle and is well suited for data analysis and machine learning.
It contains data for soccer matches, players, and teams from several European countries from 2008 to 2016.
This dataset is quite extensive, and we encourage you to read more about it here.
The database is stored in a SQLite database. You can access database files using software like DB Browser.
This dataset will help you get good practice with your SQL joins.
Make sure to look at how the different tables relate to each other.
Some column titles should be self-explanatory, and others you’ll have to look up on Kaggle.
http://pandas.pydata.org/pandas-docs/version/0.9.1/visualization.html
https://matplotlib.org/gallery.html
https://www.udacity.com/course/intro-to-programming-nanodegree--nd000
https://www.kaggle.com/hugomathien/soccer
How many goals has each team scored home, away and total on each season?
How many goals has each team allowed home, away and total on each season?
How teams improved or deterioted over time and what teams improved or deterioted the most and the least in terms of goals scored or allowed over time?
How many goals has each team allowed home, away and total, from other teams, over all seasons?
Is there such a thing a home advantage?
Which players had the most penalties?
What team attributes lead to the most victories?
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sqlite3
import seaborn as sns
connection = sqlite3.connect('database.sqlite')
query = '''
select
Country.name as Country,
League.name as League,
Match.season as Season,
date(Match.date) as Date,
Team_home.team_long_name as Team_Home,
Team_away.team_long_name as Team_Away,
Match.home_team_goal as Team_Home_Goal,
Match.away_team_goal as Team_Away_Goal
from
Country, League, Team as Team_home, Team as Team_away, Match
where
Country.id = League.country_id AND
Country.id = Match.country_id AND
League.id = Match.league_id AND
Team_home.team_api_id = Match.home_team_api_id AND
Team_away.team_api_id = Match.away_team_api_id AND
Country = 'Spain' AND
League.name = 'Spain LIGA BBVA'
order by
Match.date
-- limit 10;
'''
#Run query and store results in a dataframe
spain_liga_bbva_df = pd.read_sql(query,connection)
print(spain_liga_bbva_df.columns)
"""
reset_index_pivot_stacked_bar_plot_series(Args) is used for resetting an index of an input series, store result in a dataframe, pivot it so that each Season is a seperate column and finally plot its input columns over its input index.
Args:
plot_type: Type of chart plotted, 'stacked_bar' or 'heat_map'
season_team_scored_allowed_sum: input pandas Series containing Season, Team and sum of goals scored or allowed for each season
index: 'Team_Home' or 'Team_Away' or 'Team' index used for x label plotting
columns: Columns to pivot resulting dataframe, usually 'Season', used as x label subplotting
values: 'Team_Home_Goal' or 'Team_Away_Goal' or 'Team_Goal' or 'Team_Allowed' columns used as values of a plot
title: Title of a plot
xlabel: X axis label of a plot
ylabel: Y axis label of a plot
Returns:
Pivoted DataFrame with each Season in a seperate column.
Raises:
None
"""
def reset_index_pivot_plot_series(plot_type, plot_color, season_team_scored_allowed_sum, \
input_index, input_columns, input_values, title, xlabel, ylabel, reset_flag, pivot_flag):
if reset_flag:
#Reset index of series so 'Season' column is fully populated, store in pandas DataFrame
season_team_scored_allowed_sum = season_team_scored_allowed_sum.reset_index()
if pivot_flag:
#Pivot dataframe so that each Season is a seperate column, needed for multi-series plots
season_team_scored_allowed_sum = season_team_scored_allowed_sum.pivot(index=input_index, columns=input_columns, values=input_values)
if plot_type == 'stacked_bar':
#Plot a stacked bar of final DataFrame with details provided
season_team_scored_allowed_sum.plot(kind='bar', stacked=True, figsize=(20, 15))
elif plot_type == 'heat_map':
#Plot a heat map of final DataFrame with details provided
plt.subplots(figsize=(20,15))
sns.heatmap(season_team_scored_allowed_sum, annot=True, cmap=plot_color)
else:
#Plot a stacked bar of final DataFrame with details provided
season_team_scored_allowed_sum.plot(kind='bar', stacked=True, figsize=(20, 15))
plt.title(title)
plt.xlabel(xlabel)
plt.ylabel(ylabel)
plt.show()
#Return final DataFrame
return season_team_scored_allowed_sum
season_team_home_goal_sum = spain_liga_bbva_df.groupby(['Season', 'Team_Home'])['Team_Home_Goal'].sum()
season_team_away_goal_sum = spain_liga_bbva_df.groupby(['Season', 'Team_Away'])['Team_Away_Goal'].sum()
season_team_goal_sum = season_team_home_goal_sum + season_team_away_goal_sum
print type(season_team_home_goal_sum)
print season_team_home_goal_sum.head(30)
season_team_home_goal_sum = reset_index_pivot_plot_series('stacked_bar', '', season_team_home_goal_sum, \
'Team_Home', 'Season', 'Team_Home_Goal', 'Home goals scored per Season', 'Team', 'Home Goals Scored', True, True)
season_team_home_goal_sum.head()
season_team_home_goal_sum = reset_index_pivot_plot_series('heat_map', 'YlGnBu', season_team_home_goal_sum, \
'Team_Home', 'Season', 'Team_Home_Goal', 'Home goals scored per Season', 'Season', 'Team', False, False)
season_team_home_goal_sum_diff = season_team_home_goal_sum.diff(periods=1, axis='columns')
season_team_home_goal_sum_diff = reset_index_pivot_plot_series('heat_map', 'cool', season_team_home_goal_sum_diff, \
'Team_Home', 'Season', 'Team_Home_Goal', 'Home goals scored differentials over Seasons', 'Season', 'Team', False, False)
season_team_away_goal_sum = reset_index_pivot_plot_series('stacked_bar', '', season_team_away_goal_sum, \
'Team_Away', 'Season', 'Team_Away_Goal', 'Away goals scored per Season', 'Team', 'Away Goals Scored', True, True)
season_team_away_goal_sum.head()
season_team_away_goal_sum = reset_index_pivot_plot_series('heat_map', 'YlGnBu', season_team_away_goal_sum, \
'Team_Away', 'Season', 'Team_Away_Goal', 'Away goals scored per Season', 'Season', 'Team', False, False)
season_team_away_goal_sum_diff = season_team_away_goal_sum.diff(periods=1, axis='columns')
season_team_away_goal_sum_diff = reset_index_pivot_plot_series('heat_map', 'cool', season_team_away_goal_sum_diff, \
'Team_Away', 'Season', 'Team_Away_Goal', 'Away goals scored differentials over Seasons', 'Season', 'Team', False, False)
season_team_goal_sum = season_team_goal_sum.reset_index()
season_team_goal_sum.rename(columns={'Season': 'Season', 'Team_Home': 'Team', 0: 'Team_Goal'}, inplace=True)
season_team_goal_sum = reset_index_pivot_plot_series('stacked_bar', '', season_team_goal_sum, \
'Team', 'Season', 'Team_Goal', 'Total goals scored per Season', 'Team', 'Total Goals Scored', False, True)
season_team_goal_sum.head()
season_team_goal_sum = reset_index_pivot_plot_series('heat_map', 'YlGnBu', season_team_goal_sum, \
'Team', 'Season', 'Team_Goal', 'Total goals scored per Season', 'Season', 'Team', False, False)
season_team_goal_sum_diff = season_team_goal_sum.diff(periods=1, axis='columns')
season_team_goal_sum_diff = reset_index_pivot_plot_series('heat_map', 'cool', season_team_goal_sum_diff, \
'Team', 'Season', 'Team_Goal', 'Total goals scored differentials over Seasons', 'Season', 'Team', False, False)
season_team_home_allowed_sum = spain_liga_bbva_df.groupby(['Season', 'Team_Home'])['Team_Away_Goal'].sum()
season_team_away_allowed_sum = spain_liga_bbva_df.groupby(['Season', 'Team_Away'])['Team_Home_Goal'].sum()
season_team_allowed_sum = season_team_home_allowed_sum + season_team_away_allowed_sum
print type(season_team_home_allowed_sum)
print season_team_home_allowed_sum.head(30)
season_team_home_allowed_sum = reset_index_pivot_plot_series('stacked_bar', '', season_team_home_allowed_sum, \
'Team_Home', 'Season', 'Team_Away_Goal', 'Home goals allowed per Season', 'Team', 'Home Goals Allowed', True, True)
season_team_home_allowed_sum.head()
season_team_home_allowed_sum = reset_index_pivot_plot_series('heat_map', 'RdPu', season_team_home_allowed_sum, \
'Team_Home', 'Season', 'Team_Away_Goal', 'Home goals allowed per Season', 'Season', 'Team', False, False)
season_team_home_allowed_sum_diff = season_team_home_allowed_sum.diff(periods=1, axis='columns')
season_team_home_allowed_sum_diff = reset_index_pivot_plot_series('heat_map', 'magma_r', season_team_home_allowed_sum_diff, \
'Team_Home', 'Season', 'Team_Away_Goal', 'Home goals allowed differentials over Seasons', 'Season', 'Team', False, False)
season_team_away_allowed_sum = reset_index_pivot_plot_series('stacked_bar', '', season_team_away_allowed_sum, \
'Team_Away', 'Season', 'Team_Home_Goal', 'Away goals allowed per Season', 'Team', 'Away Goals Allowed', True, True)
season_team_away_allowed_sum.head()
season_team_away_allowed_sum = reset_index_pivot_plot_series('heat_map', 'RdPu', season_team_away_allowed_sum, \
'Team_Away', 'Season', 'Team_Home_Goal', 'Away goals allowed per Season', 'Season', 'Team', False, False)
season_team_away_allowed_sum_diff = season_team_away_allowed_sum.diff(periods=1, axis='columns')
season_team_away_allowed_sum_diff = reset_index_pivot_plot_series('heat_map', 'magma_r', season_team_away_allowed_sum_diff, \
'Team_Away', 'Season', 'Team_Home_Goal', 'Away goals allowed differentials over Seasons', 'Season', 'Team', False, False)
season_team_allowed_sum = season_team_allowed_sum.reset_index()
season_team_allowed_sum.rename(columns={'Season': 'Season', 'Team_Home': 'Team', 0: 'Team_Allowed'}, inplace=True)
season_team_allowed_sum = reset_index_pivot_plot_series('stacked_bar', '', season_team_allowed_sum, \
'Team', 'Season', 'Team_Allowed', 'Total goals allowed per Season', 'Team', 'Total Goals Allowed', False, True)
season_team_allowed_sum.head()
season_team_allowed_sum = reset_index_pivot_plot_series('heat_map', 'RdPu', season_team_allowed_sum, \
'Team', 'Season', 'Team_Allowed', 'Total goals allowed per Season', 'Season', 'Team', False, False)
season_team_allowed_sum_diff = season_team_allowed_sum.diff(periods=1, axis='columns')
season_team_allowed_sum = reset_index_pivot_plot_series('heat_map', 'magma_r', season_team_allowed_sum, \
'Team', 'Season', 'Team_Allowed', 'Total goals allowed differentials over Seasons', 'Season', 'Team', False, False)
unique_seasons = pd.Series(spain_liga_bbva_df['Season'].unique())
spain_liga_bbva_df.head()
team_match_goal_sum = spain_liga_bbva_df.groupby(['Team_Home', 'Team_Away'])['Team_Home_Goal', 'Team_Away_Goal'].sum()
team_match_goal_sum.head()
team_match_home_goal_sum = reset_index_pivot_plot_series('heat_map', 'YlGnBu', team_match_goal_sum, \
'Team_Home', 'Team_Away', 'Team_Home_Goal', \
'Home goals scored over '+str(unique_seasons.size)+' Seasons: '+unique_seasons.iloc[0]+' to '+unique_seasons.iloc[-1], \
'Team Away', 'Team Home', True, True)
team_match_home_goal_sum.head()
team_match_away_goal_sum = reset_index_pivot_plot_series('heat_map', 'YlGnBu', team_match_goal_sum, \
'Team_Away', 'Team_Home', 'Team_Away_Goal', \
'Away goals scored over '+str(unique_seasons.size)+' Seasons: '+unique_seasons.iloc[0]+' to '+unique_seasons.iloc[-1], \
'Team Home', 'Team Away', True, True)
team_match_away_goal_sum.head()
team_match_total_goal_sum = team_match_home_goal_sum + team_match_away_goal_sum
team_match_total_goal_sum = reset_index_pivot_plot_series('heat_map', 'YlGnBu', team_match_total_goal_sum, \
'', '', '', \
'Total goals scored over '+str(unique_seasons.size)+' Seasons: '+unique_seasons.iloc[0]+' to '+unique_seasons.iloc[-1], \
'Team', 'Team', False, False)
team_match_total_goal_sum.head()
team_match_home_allowed_sum = reset_index_pivot_plot_series('heat_map', 'RdPu', team_match_goal_sum, \
'Team_Home', 'Team_Away', 'Team_Away_Goal', \
'Home goals allowed over '+str(unique_seasons.size)+' Seasons: '+unique_seasons.iloc[0]+' to '+unique_seasons.iloc[-1], \
'Team Away', 'Team Home', True, True)
team_match_home_allowed_sum.head()
team_match_away_allowed_sum = reset_index_pivot_plot_series('heat_map', 'RdPu', team_match_goal_sum, \
'Team_Away', 'Team_Home', 'Team_Home_Goal', \
'Away goals allowed over '+str(unique_seasons.size)+' Seasons: '+unique_seasons.iloc[0]+' to '+unique_seasons.iloc[-1], \
'Team Home', 'Team Away', True, True)
team_match_away_allowed_sum.head()
team_match_total_allowed_sum = team_match_home_allowed_sum + team_match_away_allowed_sum
team_match_total_allowed_sum = reset_index_pivot_plot_series('heat_map', 'RdPu', team_match_total_allowed_sum, \
'', '', '', \
'Total goals allowed over '+str(unique_seasons.size)+' Seasons: '+unique_seasons.iloc[0]+' to '+unique_seasons.iloc[-1], \
'Team', 'Team', False, False)
team_match_total_allowed_sum.head()
"""
categorize_win_tie_loss(team_match_df) is used for categorizing the outcome of match as a win, tie or loss for the home team.
Args:
team_match_df: Team match DataFrame with columns 'Team_Home_Goal' and 'Team_Away_Goal' available.
Returns:
Updated DataFrame with 3 new columns created in case of a match home win, tie or loss.
Raises:
None
"""
def categorize_win_tie_loss(team_match_df):
#Reference
#https://pandas.pydata.org/pandas-docs/stable/indexing.html#setting-with-enlargement
team_match_df.loc[:, 'Team_Home_Win'] = 0
team_match_df.loc[:, 'Team_Home_Tie'] = 0
team_match_df.loc[:, 'Team_Away_Win'] = 0
team_match_df.loc[team_match_df['Team_Home_Goal'] > team_match_df['Team_Away_Goal'], 'Team_Home_Win'] = 1
team_match_df.loc[team_match_df['Team_Home_Goal'] == team_match_df['Team_Away_Goal'], 'Team_Home_Tie'] = 1
team_match_df.loc[team_match_df['Team_Home_Goal'] < team_match_df['Team_Away_Goal'], 'Team_Away_Win'] = 1
return team_match_df
spain_liga_bbva_df = categorize_win_tie_loss(spain_liga_bbva_df)
spain_liga_bbva_df.head()
team_match_outcome_sum = spain_liga_bbva_df.groupby(['Team_Home', 'Team_Away']) \
['Team_Home_Win', 'Team_Home_Tie', 'Team_Away_Win'].sum()
team_match_outcome_sum.head()
team_match_outcome_sum = team_match_outcome_sum.reset_index()
team_match_outcome_sum = team_match_outcome_sum.set_index(['Team_Home', 'Team_Away'])
"""
add_home_and_away_wins_ties(team_match_df) is used to keep track of wins earned home or away for each pair of teams.
Similalry for ties, home or away.
Args:
team_match_df: Team match DataFrame with columns ['Team_Home', 'Team_Away', 'Team_Home_Win', 'Team_Home_Tie', 'Team_Away_Win'] available.
Returns:
Updated DataFrame with 2 new columns created, one for total wins, one for total ties.
Raises:
None
"""
def add_home_and_away_wins_ties(team_match_df):
team_match_df.loc[:, 'Team_Win'] = 0
team_match_df.loc[:, 'Team_Tie'] = 0
for pair in team_match_outcome_sum.index:
team_match_df.loc[(pair[0], pair[1]), :]['Team_Win'] = team_match_df.loc[(pair[0], pair[1]), :]['Team_Home_Win'] + team_match_df.loc[(pair[1], pair[0]), :]['Team_Away_Win']
team_match_df.loc[(pair[0], pair[1]), :]['Team_Tie'] = team_match_df.loc[(pair[0], pair[1]), :]['Team_Home_Tie'] + team_match_df.loc[(pair[1], pair[0]), :]['Team_Home_Tie']
return team_match_df
team_match_outcome_sum = add_home_and_away_wins_ties(team_match_outcome_sum)
team_match_outcome_sum.head()
team_match_win_sum = reset_index_pivot_plot_series('heat_map', 'autumn_r', team_match_outcome_sum, \
'Team_Home', 'Team_Away', 'Team_Win', \
'Wins over '+str(unique_seasons.size)+' Seasons: '+unique_seasons.iloc[0]+' to '+unique_seasons.iloc[-1], \
'Team', 'Team', True, True)
team_match_win_sum.head()
team_match_tie_sum = reset_index_pivot_plot_series('heat_map', 'rainbow', team_match_outcome_sum, \
'Team_Home', 'Team_Away', 'Team_Tie', \
'Home ties over '+str(unique_seasons.size)+' Seasons: '+unique_seasons.iloc[0]+' to '+unique_seasons.iloc[-1], \
'Team', 'Team', True, True)
team_match_tie_sum.head()
spain_liga_bbva_stats = spain_liga_bbva_df.describe()
spain_liga_bbva_stats['Home_Advantage'] = spain_liga_bbva_stats['Team_Home_Goal'] / spain_liga_bbva_stats['Team_Away_Goal']
spain_liga_bbva_stats
...
Country = 'Spain' AND
League.name = 'Spain LIGA BBVA'
...
FCB = 'FC Barcelona'
RMCF = 'Real Madrid CF'
spain_liga_bbva_df[((spain_liga_bbva_df['Team_Home'] == FCB) & (spain_liga_bbva_df['Team_Away'] == RMCF)) \
| ((spain_liga_bbva_df['Team_Home'] == RMCF) & (spain_liga_bbva_df['Team_Away'] == FCB))]
spain_liga_bbva_df[((spain_liga_bbva_df['Team_Home'] == FCB) & (spain_liga_bbva_df['Team_Away'] == RMCF)) ][['Team_Home_Goal','Team_Away_Goal']].sum()
spain_liga_bbva_df[((spain_liga_bbva_df['Team_Home'] == RMCF) & (spain_liga_bbva_df['Team_Away'] == FCB)) ][['Team_Home_Goal','Team_Away_Goal']].sum()
team_match_win_sum.loc[(FCB, RMCF)]
team_match_win_sum.loc[(RMCF, FCB)]
team_match_tie_sum.loc[(FCB, RMCF)] #same as team_match_tie_sum.loc[(RMCF, FCB)] since matrix team_match_tie_sum is symmetric
https://matplotlib.org/examples/color/colormaps_reference.html
#plot_colors = ['Accent', 'Accent_r', 'Blues', 'Blues_r', 'BrBG', 'BrBG_r', 'BuGn', 'BuGn_r', 'BuPu', 'BuPu_r', 'CMRmap', 'CMRmap_r', 'Dark2', 'Dark2_r', 'GnBu', 'GnBu_r', 'Greens', 'Greens_r', 'Greys', 'Greys_r', 'OrRd', 'OrRd_r', 'Oranges', 'Oranges_r', 'PRGn', 'PRGn_r', 'Paired', 'Paired_r', 'Pastel1', 'Pastel1_r', 'Pastel2', 'Pastel2_r', 'PiYG', 'PiYG_r', 'PuBu', 'PuBuGn', 'PuBuGn_r', 'PuBu_r', 'PuOr', 'PuOr_r', 'PuRd', 'PuRd_r', 'Purples', 'Purples_r', 'RdBu', 'RdBu_r', 'RdGy', 'RdGy_r', 'RdPu', 'RdPu_r', 'RdYlBu', 'RdYlBu_r', 'RdYlGn', 'RdYlGn_r', 'Reds', 'Reds_r', 'Set1', 'Set1_r', 'Set2', 'Set2_r', 'Set3', 'Set3_r', 'Spectral', 'Spectral_r', 'Vega10', 'Vega10_r', 'Vega20', 'Vega20_r', 'Vega20b', 'Vega20b_r', 'Vega20c', 'Vega20c_r', 'Wistia', 'Wistia_r', 'YlGn', 'YlGnBu', 'YlGnBu_r', 'YlGn_r', 'YlOrBr', 'YlOrBr_r', 'YlOrRd', 'YlOrRd_r', 'afmhot', 'afmhot_r', 'autumn', 'autumn_r', 'binary', 'binary_r', 'bone', 'bone_r', 'brg', 'brg_r', 'bwr', 'bwr_r', 'cool', 'cool_r', 'coolwarm', 'coolwarm_r', 'copper', 'copper_r', 'cubehelix', 'cubehelix_r', 'flag', 'flag_r', 'gist_earth', 'gist_earth_r', 'gist_gray', 'gist_gray_r', 'gist_heat', 'gist_heat_r', 'gist_ncar', 'gist_ncar_r', 'gist_rainbow', 'gist_rainbow_r', 'gist_stern', 'gist_stern_r', 'gist_yarg', 'gist_yarg_r', 'gnuplot', 'gnuplot2', 'gnuplot2_r', 'gnuplot_r', 'gray', 'gray_r', 'hot', 'hot_r', 'hsv', 'hsv_r', 'icefire', 'icefire_r', 'inferno', 'inferno_r', 'jet', 'jet_r', 'magma', 'magma_r', 'mako', 'mako_r', 'nipy_spectral', 'nipy_spectral_r', 'ocean', 'ocean_r', 'pink', 'pink_r', 'plasma', 'plasma_r', 'prism', 'prism_r', 'rainbow', 'rainbow_r', 'rocket', 'rocket_r', 'seismic', 'seismic_r', 'spectral', 'spectral_r', 'spring', 'spring_r', 'summer', 'summer_r', 'tab10', 'tab10_r', 'tab20', 'tab20_r', 'tab20b', 'tab20b_r', 'tab20c', 'tab20c_r', 'terrain', 'terrain_r', 'viridis', 'viridis_r', 'vlag', 'vlag_r', 'winter', 'winter_r']
plot_colors = ['YlGnBu', 'cool', 'RdPu', 'magma_r', 'plasma_r']
for plot_color in plot_colors:
season_team_home_goal_sum = reset_index_pivot_plot_series('heat_map', plot_color, season_team_home_goal_sum, \
'Team_Home', 'Season', 'Team_Home_Goal', plot_color + ' - Home goals scored per Season', 'Team', 'Home Goals Scored', False, False)